{
 "cells": [
  {
   "metadata": {
    "ExecuteTime": {
     "end_time": "2025-01-16T07:06:27.546613Z",
     "start_time": "2025-01-16T07:06:27.539534Z"
    }
   },
   "cell_type": "code",
   "source": [
    "import sys\n",
    "\n",
    "sys.path.append('../toolkit/')\n",
    "from htmlrag import clean_html\n",
    "\n",
    "question=\"When was the bellagio in las vegas built?\"\n",
    "html=\"\"\"\n",
    "<html>\n",
    "<head>\n",
    "<h1>Bellagio Hotel in Las</h1>\n",
    "</head>\n",
    "<body>\n",
    "<p class=\"class0\">The Bellagio is a luxury hotel and casino located on the Las Vegas Strip in Paradise, Nevada. It was built in 1998.</p>\n",
    "</body>\n",
    "<div>\n",
    "<div>\n",
    "<p>Some other text</p>\n",
    "<p>Some other text</p>\n",
    "</div>\n",
    "</div>\n",
    "<p class=\"class1\"></p>\n",
    "<!-- Some comment -->\n",
    "<script type=\"text/javascript\">\n",
    "document.write(\"Hello World!\");\n",
    "</script>\n",
    "</html>\n",
    "\"\"\"\n",
    "\n",
    "# html=open(\"../govdata.html\").read()\n",
    "\n",
    "simplified_html = clean_html(html)\n",
    "print(simplified_html)\n",
    "\n",
    "\n",
    "# <html>\n",
    "# <title>A short description of the Bellagio hotel</title>\n",
    "# <p>The Bellagio is a luxury hotel and casino located on the Las Vegas Strip in Paradise, Nevada. It was built in 1998.</p>\n",
    "# <div>\n",
    "# <p>Some other text</p>\n",
    "# <p>Some other text</p>\n",
    "# </div>\n",
    "# </html>"
   ],
   "id": "4391a6d075278353",
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "<html>\n",
      "<h1>Bellagio Hotel in Las</h1>\n",
      "<p>The Bellagio is a luxury hotel and casino located on the Las Vegas Strip in Paradise, Nevada. It was built in 1998.</p>\n",
      "<div>\n",
      "<p>Some other text</p>\n",
      "<p>Some other text</p>\n",
      "</div>\n",
      "</html>\n"
     ]
    }
   ],
   "execution_count": 21
  },
  {
   "metadata": {
    "ExecuteTime": {
     "end_time": "2025-01-16T07:06:28.176816Z",
     "start_time": "2025-01-16T07:06:28.173069Z"
    }
   },
   "cell_type": "code",
   "source": [
    "# Maximum number of words in a node when constructing the block tree for pruning with the embedding model\n",
    "MAX_NODE_WORDS_EMBED = 10\n",
    "# MAX_NODE_WORDS_EMBED = 256 # a recommended setting for real-world HTML documents\n",
    "# Maximum number of tokens in the output HTML document pruned with the embedding model\n",
    "MAX_CONTEXT_WINDOW_EMBED = 60\n",
    "# MAX_CONTEXT_WINDOW_EMBED = 6144 # a recommended setting for real-world HTML documents\n",
    "# Maximum number of words in a node when constructing the block tree for pruning with the generative model\n",
    "MAX_NODE_WORDS_GEN = 5\n",
    "# MAX_NODE_WORDS_GEN = 128 # a recommended setting for real-world HTML documents\n",
    "# Maximum number of tokens in the output HTML document pruned with the generative model\n",
    "MAX_CONTEXT_WINDOW_GEN = 32\n",
    "# MAX_CONTEXT_WINDOW_GEN = 4096 # a recommended setting for real-world HTML documents"
   ],
   "id": "aad73cfb61ec08c9",
   "outputs": [],
   "execution_count": 22
  },
  {
   "metadata": {
    "ExecuteTime": {
     "end_time": "2025-01-16T07:06:28.906918Z",
     "start_time": "2025-01-16T07:06:28.900700Z"
    }
   },
   "cell_type": "code",
   "source": [
    "from htmlrag import build_block_tree\n",
    "\n",
    "# if you have mutiple HTML documents, merge them using the following code\n",
    "# simplified_html = \"\\n\".join([clean_html(html) for html in htmls])\n",
    "block_tree, simplified_html=build_block_tree(simplified_html, max_node_words=MAX_NODE_WORDS_EMBED)\n",
    "# block_tree, simplified_html=build_block_tree(simplified_html, max_node_words=256, zh_char=True) # for Chinese text\n",
    "for block in block_tree:\n",
    "    print(\"Block Content: \", block[0])\n",
    "    print(\"Block Path: \", block[1])\n",
    "    print(\"Is Leaf: \", block[2])\n",
    "    print(\"\")\n",
    "\n",
    "# Block Content:  <title>A short description of the Bellagio hotel</title>\n",
    "# Block Path:  ['html', 'title']\n",
    "# Is Leaf:  True\n",
    "# \n",
    "# Block Content:  <div>\n",
    "# <p>Some other text</p>\n",
    "# <p>Some other text</p>\n",
    "# </div>\n",
    "# Block Path:  ['html', 'div']\n",
    "# Is Leaf:  True\n",
    "# \n",
    "# Block Content:  <p>The Bellagio is a luxury hotel and casino located on the Las Vegas Strip in Paradise, Nevada. It was built in 1998.</p>\n",
    "# Block Path:  ['html', 'p']\n",
    "# Is Leaf:  True\n"
   ],
   "id": "3b9775fe9a0f8a98",
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Block Content:  <h1>Bellagio Hotel in Las</h1>\n",
      "Block Path:  ['html', 'h1']\n",
      "Is Leaf:  True\n",
      "\n",
      "Block Content:  <div>\n",
      "<p>Some other text</p>\n",
      "<p>Some other text</p>\n",
      "</div>\n",
      "Block Path:  ['html', 'div']\n",
      "Is Leaf:  True\n",
      "\n",
      "Block Content:  <p>The Bellagio is a luxury hotel and casino located on the Las Vegas Strip in Paradise, Nevada. It was built in 1998.</p>\n",
      "Block Path:  ['html', 'p']\n",
      "Is Leaf:  True\n",
      "\n"
     ]
    }
   ],
   "execution_count": 23
  },
  {
   "metadata": {
    "ExecuteTime": {
     "end_time": "2025-01-16T07:06:30.317204Z",
     "start_time": "2025-01-16T07:06:29.570490Z"
    }
   },
   "cell_type": "code",
   "source": [
    "from htmlrag import EmbedHTMLPruner\n",
    "\n",
    "embed_model=\"/processing_data/biz/jiejuntan/huggingface/bge-large-en/\"\n",
    "query_instruction_for_retrieval = \"Instruct: Given a web search query, retrieve relevant passages that answer the query\\nQuery: \"\n",
    "embed_html_pruner = EmbedHTMLPruner(embed_model=embed_model, local_inference=True, query_instruction_for_retrieval=query_instruction_for_retrieval)\n",
    "# alternatively you can init a remote TEI model, refer to https://github.com/huggingface/text-embeddings-inference.\n",
    "# tei_endpoint=\"http://YOUR_TEI_ENDPOINT\"\n",
    "# embed_html_pruner = EmbedHTMLPruner(embed_model=embed_model, local_inference=False, query_instruction_for_retrieval = query_instruction_for_retrieval, endpoint=tei_endpoint)\n",
    "block_rankings=embed_html_pruner.calculate_block_rankings(question, simplified_html, block_tree)\n",
    "print(block_rankings)\n",
    "\n",
    "# [2, 0, 1]"
   ],
   "id": "f56d81bc017b300e",
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[2, 0, 1]\n"
     ]
    }
   ],
   "execution_count": 24
  },
  {
   "metadata": {
    "ExecuteTime": {
     "end_time": "2025-01-16T07:06:31.565824Z",
     "start_time": "2025-01-16T07:06:31.047454Z"
    }
   },
   "cell_type": "code",
   "source": [
    "from transformers import AutoTokenizer\n",
    "\n",
    "chat_tokenizer = AutoTokenizer.from_pretrained(\"/processing_data/biz/jiejuntan/huggingface/Meta-Llama-3.1-70B-Instruct/\")\n",
    "\n",
    "pruned_html = embed_html_pruner.prune_HTML(simplified_html, block_tree, block_rankings, chat_tokenizer, MAX_CONTEXT_WINDOW_EMBED)\n",
    "# pruned_html=bm25_html_pruner.prune_HTML(simplified_html, block_tree, block_rankings, chat_tokenizer, max_context_window)\n",
    "print(pruned_html)\n",
    "\n",
    "# <html>\n",
    "# <title>A short description of the Bellagio hotel</title>\n",
    "# <p>The Bellagio is a luxury hotel and casino located on the Las Vegas Strip in Paradise, Nevada. It was built in 1998.</p>\n",
    "# </html>"
   ],
   "id": "b902c4b1f7f74c1f",
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "<html>\n",
      "<h1>Bellagio Hotel in Las</h1>\n",
      "<p>The Bellagio is a luxury hotel and casino located on the Las Vegas Strip in Paradise, Nevada. It was built in 1998.</p>\n",
      "</html>\n"
     ]
    }
   ],
   "execution_count": 25
  },
  {
   "metadata": {
    "ExecuteTime": {
     "end_time": "2025-01-16T07:06:32.205931Z",
     "start_time": "2025-01-16T07:06:32.200552Z"
    }
   },
   "cell_type": "code",
   "source": [
    "block_tree, pruned_html=build_block_tree(pruned_html, max_node_words=10)\n",
    "for block in block_tree:\n",
    "    print(\"Block Content: \", block[0])\n",
    "    print(\"Block Path: \", block[1])\n",
    "    print(\"Is Leaf: \", block[2])\n",
    "    print(\"\")\n",
    "\n",
    "# Block Content:  <title>A short description of the Bellagio hotel</title>\n",
    "# Block Path:  ['html', 'title']\n",
    "# Is Leaf:  True\n",
    "#\n",
    "# Block Content:  <p>The Bellagio is a luxury hotel and casino located on the Las Vegas Strip in Paradise, Nevada. It was built in 1998.</p>\n",
    "# Block Path:  ['html', 'p']\n",
    "# Is Leaf:  True"
   ],
   "id": "b7fc1c9281f42315",
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Block Content:  <h1>Bellagio Hotel in Las</h1>\n",
      "Block Path:  ['html', 'h1']\n",
      "Is Leaf:  True\n",
      "\n",
      "Block Content:  <p>The Bellagio is a luxury hotel and casino located on the Las Vegas Strip in Paradise, Nevada. It was built in 1998.</p>\n",
      "Block Path:  ['html', 'p']\n",
      "Is Leaf:  True\n",
      "\n"
     ]
    }
   ],
   "execution_count": 26
  },
  {
   "metadata": {
    "ExecuteTime": {
     "end_time": "2025-01-16T07:06:33.154256Z",
     "start_time": "2025-01-16T07:06:33.069441Z"
    }
   },
   "cell_type": "code",
   "source": [
    "#. alternatively you can use bm25 to rank the blocks\n",
    "from htmlrag import BM25HTMLPruner\n",
    "bm25_html_pruner = BM25HTMLPruner()\n",
    "block_rankings=bm25_html_pruner.calculate_block_rankings(question, simplified_html, block_tree)\n",
    "print(block_rankings)\n",
    "assert len(block_rankings)==len(block_tree), f\"The number of block rankings {len(block_rankings)} should be equal to the number of blocks {len(block_tree)}\"\n",
    "# [2, 0, 1]"
   ],
   "id": "153d1675e2a7631d",
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[1, 0]\n"
     ]
    }
   ],
   "execution_count": 27
  },
  {
   "metadata": {
    "ExecuteTime": {
     "end_time": "2025-01-16T07:06:44.714698Z",
     "start_time": "2025-01-16T07:06:41.513087Z"
    }
   },
   "cell_type": "code",
   "source": [
    "from htmlrag import GenHTMLPruner\n",
    "import torch\n",
    "\n",
    "ckpt_path = \"/processing_data/biz/jiejuntan/huggingface/HTML-Pruner-Phi-3.8B\"\n",
    "# ckpt_path = \"/processing_data/biz/jiejuntan/huggingface/HTML-Pruner-Llama-1B\"\n",
    "if torch.cuda.is_available():\n",
    "    device=\"cuda\"\n",
    "else:\n",
    "    device=\"cpu\"\n",
    "gen_html_pruner = GenHTMLPruner(gen_model=ckpt_path, device=device)\n",
    "block_rankings=gen_html_pruner.calculate_block_rankings(question, pruned_html, block_tree)\n",
    "print(block_rankings)\n",
    "\n",
    "# [1, 0]"
   ],
   "id": "8c2ed17ba20f51ef",
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.66it/s]\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[1, 0]\n"
     ]
    }
   ],
   "execution_count": 28
  },
  {
   "metadata": {
    "ExecuteTime": {
     "end_time": "2025-01-16T07:06:48.290615Z",
     "start_time": "2025-01-16T07:06:48.284544Z"
    }
   },
   "cell_type": "code",
   "source": [
    "max_context_window=32\n",
    "pruned_html=gen_html_pruner.prune_HTML(pruned_html, block_tree, block_rankings, chat_tokenizer, max_context_window)\n",
    "print(pruned_html)\n",
    "\n",
    "# <p>The Bellagio is a luxury hotel and casino located on the Las Vegas Strip in Paradise, Nevada. It was built in 1998.</p>"
   ],
   "id": "fcbeb84dfd3751a7",
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "<p>The Bellagio is a luxury hotel and casino located on the Las Vegas Strip in Paradise, Nevada. It was built in 1998.</p>\n"
     ]
    }
   ],
   "execution_count": 29
  },
  {
   "cell_type": "code",
   "id": "1d6bcf5600b11388",
   "metadata": {
    "ExecuteTime": {
     "end_time": "2025-01-16T07:05:50.381894Z",
     "start_time": "2025-01-16T07:05:46.977342Z"
    }
   },
   "source": [
    "#. a quick start example\n",
    "html=open(\"../html_data/example/Washington Post.html\").read()\n",
    "question=\"What are the main policies or bills that Biden touted besides the American Rescue Plan?\"\n",
    "simplified_html=clean_html(html)\n",
    "block_tree, simplified_html=build_block_tree(simplified_html, max_node_words=256)\n",
    "embed_html_pruner = EmbedHTMLPruner(embed_model=embed_model, local_inference=True)\n",
    "block_rankings=embed_html_pruner.calculate_block_rankings(question, simplified_html, block_tree)\n",
    "pruned_html=embed_html_pruner.prune_HTML(simplified_html, block_tree, block_rankings, chat_tokenizer, 4096)\n",
    "gen_html_pruner = GenHTMLPruner(gen_model=ckpt_path, device=device)\n",
    "block_tree, pruned_html=build_block_tree(pruned_html, max_node_words=128)\n",
    "block_rankings=gen_html_pruner.calculate_block_rankings(question, pruned_html, block_tree)\n",
    "pruned_html=gen_html_pruner.prune_HTML(pruned_html, block_tree, block_rankings, chat_tokenizer, 2048)\n",
    "print(pruned_html)"
   ],
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "<html><title>Biden warns of the rise of a new American ‘oligarchy’  - The Washington Post</title>\n",
      "<div1><p0>President Joe Biden used his final address from the Oval Office to deliver a somber warning about the threat posed by the “dangerous concentration of power” in the hands of wealthy and well-connected individuals, a thinly veiled reference to billionaire technology executives who have been increasingly signaling their desire to work closely with President-elect <a>Donald Trump</a>.</p0><p1>“Today, an oligarchy is taking shape in America of extreme wealth, power and influence that literally threatens our entire democracy, our basic rights and freedoms, and a fair shot for everyone to get ahead,” Biden said during his farewell speech, days before he steps down from a four-year presidency and a lifetime in public office. “We see the consequences all across America, and we’ve seen it before.”</p1><p2>Biden likened the current crop of tech moguls to the “robber barons” of the 19th century, men like John D. Rockefeller and Andrew Carnegie. Recalling <a>President Dwight D. Eisenhower’s farewell address</a> warning about the military-industrial complex, Biden decried a “tech-industrial complex that could pose real dangers to our country.”</p2><p3>The stark comparisons underscored how a president who has often heralded the “possibilities” afforded by America and proclaimed himself an enduring optimist is ending a 50-year career in public service with deep concerns that the nation’s promise is being eroded by its wealthiest citizens. Biden spoke from a storied location that in five days is to be occupied by Biden’s major political adversary, a man he has described as a threat to democracy and unfit for the presidency.</p3><p4>Tech executives have been <a>visiting and dining with Trump at his Mar-a-Lago estate</a> and donating millions to his inaugural committee. Tech moguls Jeff Bezos, Elon Musk and Mark Zuckerberg are <a>planning to attend Trump’s inauguration Monday</a>, with prime seating on the dais that illustrates deepening ties between the nation’s top technology leaders and the incoming administration. (Bezos, the founder of Amazon, owns The Washington Post.)</p4><p7>Biden delivered the speech with images of his family, including his deceased son, Beau, behind him. His voice at times sounded raspy and he stumbled over a few words, but he largely delivered a forceful farewell with stern warnings for the future.</p7><p8>He suggested that artificial intelligence could “spawn new threats to our rights, our way of life, to our privacy.” He lamented the “crumbling” of a free press. He warned that social media companies have given up on trying to verify the information they distribute, a reference to Zuckerberg’s recent decision to end fact-checking on Facebook and Instagram.</p8><p9>“The truth is smothered by lies told for power and for profit,” he said.</p9><p11>Still, Biden aimed to balance the warning with a sense of optimism. He recalled how he went from being a child with stutter to becoming the country’s youngest senator and then eventually vice president to the nation’s first Black president. Biden has cast his ascent to the presidency, an office he spent decades pursuing, as a uniquely American tale of opportunity and resilience.</p11><p13>While several presidents have delivered parting speeches from the White House, the setting was more staid than the arena-like atmosphere of former president Barack Obama’s farewell speech in 2017. Obama spoke to some 18,000 supporters at Chicago’s McCormick Place Convention Center, encouraging them against despair, days before Trump was set to be sworn in, and reminding allies of what they had accomplished together.</p13><p14>Obama, too, warned against what he said were the dangers posed by an incoming Trump administration, in some ways prefiguring Biden’s address Wednesday. “If every economic issue is framed as a struggle between a hardworking, White middle class and an undeserving minority, then workers of all shades will be left fighting for scraps while the wealthy withdraw further into their private enclaves,” Obama <a>said at the time</a>.</p14><p16>As Biden and his aides have done repeatedly in the weeks since Trump’s victory, the president argued that the full benefits of his agenda will only be felt in the years and decades ahead, as new roads are paved and factories built. In some ways, the speech was designed to address tomorrow’s historians as much as today’s electorate.</p16><p17>“It will take time to feel the full impact of all we’ve done together,” he said. “But the seeds are planted, and they’ll grow and they’ll bloom for decades to come.”</p17><p18>Biden listed safeguards he said should be enacted against abuse of power by leaders of technology and government. He called for revisiting the tax code to make billionaires “pay their fair share,” getting dark money out of politics, instituting term limits and ethical guardrails for the Supreme Court, and banning members of Congress from trading stocks.</p18><p21>It is not clear how Biden’s warnings will register with an electorate that soundly rejected his party in November. Leaving office with low approval ratings, Biden has had to grapple with the reality that despite his accomplishments, many Americans have chosen a sharply different direction.</p21><p22>In recent weeks, Biden has delivered legacy-focused speeches, praising his administration’s record on the economy and foreign policy and suggesting that time will prove his doubters wrong.</p22><p23>Biden is the first president since Lyndon B. Johnson to step aside rather than seek a second full term, although the widespread dissatisfaction of fellow Democratic leaders left him little choice.</p23><p24>His remarks Wednesday in some ways echoed the parting words of Johnson, who also had an ambitious domestic agenda but struggled with low approval ratings. Johnson in his 1969 farewell address sought to paint a portrait of progress during his time in office, citing a strong economy, major legislative wins on social programs and advancements for people of color.</p24><p25>Biden spoke hours after his administration had helped broker a ceasefire deal between Israel and <a>Hamas</a>, ending months of painstaking negotiations. He began his remarks by hailing the agreement, which, if implemented, would reunite nearly three dozen Israeli hostages with their families and provide war-weary Palestinians a reprieve from deadly violence in the <a>Gaza Strip</a>.</p25><p26>“This plan was developed and negotiated by my team and will be largely implemented by the incoming administration,” Biden said. “That’s why I told my team to keep the incoming administration fully informed, because that’s how it should be — working together as Americans.”</p26><p27>Biden’s message offered a sharp contrast to the dystopian view that Trump presented of the country under Biden’s stewardship. Trump, who is set to deliver his inaugural address Monday, has said Biden’s presidency has been among the worst in history. He has strayed from the facts as he has described a nation overrun with criminal migrants, crumbling under runaway inflation and governed by vindictive bureaucrats.</p27><p28>Trump, who famously used his first inaugural speech to rail against “American carnage,” has suggested his mission in a second term will be to undo many of the policies and programs Biden put in place.</p28><p29>While presidents have often used farewell remarks to speak to the challenges ahead, Biden was in the unique position of speaking days before the inauguration of a man he has long described as an existential threat to democracy.</p29><p30>“Trump looms heavily over this because November completely flipped the switch on how Biden’s going to be viewed,” said Tevi Troy, a presidential historian. “He was the dragon-slayer, the guy who defeated Trump ... and now he is the guy who let the dragon come back.”</p30><p31>For all the warnings, Biden ended his speech on an optimistic note, suggesting he still had faith that America would live up to its promise.</p31><p32>“I still believe in the idea for which this nation stands, a nation where the strength of our institutions, and the character of our people, matter and must endure,” he said. “Now it’s your turn to stand guard.”</p32></div1></html>\n"
     ]
    }
   ],
   "execution_count": 20
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.9.20"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
